Load necessary packages.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.2
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(ggwordcloud)
## Warning: package 'ggwordcloud' was built under R version 4.3.2
Read the dataset and drop NA vlaues.
youtube_df <- read_csv("Data/cleaned_youtube_df.csv") %>%
janitor::clean_names() %>%
drop_na()
## Rows: 995 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): category, country, abbreviation, channel_type
## dbl (14): id, subscribers, video_views, uploads, video_views_for_the_last_30...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(youtube_df)
## id subscribers video_views category
## Min. : 1.0 Min. : 12300000 Min. :2.634e+03 Length:589
## 1st Qu.:213.0 1st Qu.: 14700000 1st Qu.:4.910e+09 Class :character
## Median :454.0 Median : 18600000 Median :8.761e+09 Mode :character
## Mean :471.2 Mean : 24264516 Mean :1.280e+10
## 3rd Qu.:723.0 3rd Qu.: 26400000 3rd Qu.:1.507e+10
## Max. :995.0 Max. :245000000 Max. :2.280e+11
## uploads country abbreviation channel_type
## Min. : 1 Length:589 Length:589 Length:589
## 1st Qu.: 433 Class :character Class :character Class :character
## Median : 1189 Mode :character Mode :character Mode :character
## Mean : 13945
## 3rd Qu.: 3882
## Max. :301308
## video_views_for_the_last_30_days lowest_monthly_earnings
## Min. :3.000e+00 Min. : 0
## 1st Qu.:4.845e+07 1st Qu.: 11500
## Median :1.143e+08 Median : 28200
## Mean :2.446e+08 Mean : 54260
## 3rd Qu.:2.441e+08 3rd Qu.: 59000
## Max. :6.589e+09 Max. :850900
## highest_monthly_earnings lowest_yearly_earnings highest_yearly_earnings
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 183200 1st Qu.: 137400 1st Qu.: 2200000
## Median : 451100 Median : 338300 Median : 5400000
## Mean : 867499 Mean : 650618 Mean : 10417717
## 3rd Qu.: 944000 3rd Qu.: 708000 3rd Qu.: 11300000
## Max. :13600000 Max. :10200000 Max. :163400000
## subscribers_for_last_30_days created_year population
## Min. : 1 Min. :1970 Min. :2.025e+05
## 1st Qu.: 100000 1st Qu.:2010 1st Qu.:1.081e+08
## Median : 200000 Median :2013 Median :3.282e+08
## Mean : 356695 Mean :2013 Mean :4.906e+08
## 3rd Qu.: 400000 3rd Qu.:2016 3rd Qu.:3.282e+08
## Max. :8000000 Max. :2022 Max. :1.398e+09
## latitude longitude
## Min. :-38.42 Min. :-172.105
## 1st Qu.: 20.59 1st Qu.: -95.713
## Median : 30.59 Median : -3.436
## Mean : 26.21 Mean : -6.254
## 3rd Qu.: 37.09 3rd Qu.: 78.963
## Max. : 61.52 Max. : 138.253
We use histogram and density plot to intuitively visualize the distribution of different variables.
all_columns <- colnames(youtube_df)
columns_to_plot <- all_columns[!all_columns %in% c("id", "category","country","abbreviation","channel_type","population","latitude","longitude","created_year")]
numeric_data_long <-
youtube_df[, columns_to_plot] %>%
gather(key = "variable", value = "value")
# Create a single plot with facets for each numeric variable
p <- ggplot(numeric_data_long, aes(x = value)) +
geom_histogram(aes(y = ..density..),bins = 15, fill = "#8dab7f", alpha = 0.8) +
geom_density(color="#6b8e23")+
facet_wrap(~ variable, scales = "free", ncol = 3) +
scale_x_continuous(labels = scales::comma) +
theme_minimal(base_size = 10) +
theme(
strip.text.x = element_text(size = 10, face = "bold"),
axis.text.x = element_text(angle = 20, hjust = 1, vjust = 1,size=7,face = "bold"), # Angle x-axis labels for readability
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
plot.title = element_text(size = 16, face = "bold"),
plot.margin = margin(1, 1, 1, 1, "cm"), # Adjust the plot margins
strip.background = element_blank(),
panel.spacing = unit(3, "lines")
) +
labs(
title = "Distribution of Numeric Variables",
x = "Value",
y = "Frequency",
caption = "Source: YouTube Data"
)
# Convert to an interactive plot
ggplotly(p)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## ℹ The deprecated feature was likely used in the ggplot2 package.
## Please report the issue at <https://github.com/tidyverse/ggplot2/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
We plot heatmap to study the relationships among numerical variables.
# Calculate the correlation matrix
cor_matrix <- cor(youtube_df[, columns_to_plot], use = "complete.obs")
fig <- plot_ly(x = colnames(cor_matrix), y = rownames(cor_matrix), z = cor_matrix,
type = "heatmap",colorscale ="Greens" , zmin = -1, zmax = 1)
fig <- fig %>% layout(
yaxis = list(autorange = "reversed"),
width=800,
height=600,
title = "Correlation Matrix")
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
fig
Wordcloud for channel category.
category_data <- youtube_df %>%
filter(!is.na(category) & category != "nan") %>%
count(category) %>%
mutate(n=n*30) %>%
ungroup()
category_data$scaled_size <- log(category_data$n + 1) # adding 1 to avoid log(0)
wordcloud_plot <- ggplot(category_data, aes(label = category, size = scaled_size)) +
geom_text_wordcloud(
aes(color = n),
shape = 'circle',
rm_outside = TRUE
) +
scale_size_area(max_size = 10) +
scale_color_gradient(low = "#ffcc99", high = "#8dab7f") +
theme_void(base_family = "sans") +
theme(legend.position = "none",
plot.margin = margin(1, 1, 1, 1, "cm")) # Adjust margins around the plot
# Display the plot
wordcloud_plot
Pie chart for channel type.
channel_type_counts <- table(youtube_df$channel_type)
channel_type_counts <- youtube_df %>%
group_by(channel_type) %>%
summarise(count = n()) %>%
ungroup()
color <- c("#ffcc99","#ffe4b5", "#ffd180","#ffa07a","#d1d17a", "#8dab7f", "#D2DFD9", "#A8C0B5", "#D1B9CB", "#B3CDD1", "#BBC1D0", "#E8C3C3","#C7CEBD", "#D2DFD9","#6b8e23")
# Create a pie chart using plotly with the custom colors
fig <- plot_ly(channel_type_counts, labels = ~channel_type, values = ~count, type = 'pie',
textinfo = 'label+percent',
insidetextorientation = 'radial',
marker = list(colors = color))
fig %>%
layout(title = 'Pie Chart of Channel Types',
showlegend = FALSE,
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))